Visualization of the Segmentation by municipalities using Python.
Virginia Jimenez Mohedano (LinkedIn) and Raazesh Sainudiin (LinkedIn).
This project was supported by UAB SENSMETRY, through a Data Science Thesis Internship
held by Virginia J.M. between 2022-01-17 and 2022-06-05, and by the
Databricks University Alliance, with infrastructure credits from AWS granted to
Raazesh Sainudiin, Department of Mathematics, Uppsala University, Sweden.
2022, Uppsala, Sweden
# Reading accident frequencies for each municipality previously obtained
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DoubleType
# Explicit two-column schema: municipality name (string) and its relative
# accident frequency (double); both columns are nullable.
schema = StructType().add("municipality", StringType(), True).add("frequency", DoubleType(), True)

# Load the previously computed per-municipality frequencies from DBFS.
# The CSV has a header row, which is skipped in favour of the schema above.
municipality_freq = spark.read.csv(
    "dbfs:/datasets/lithuania/municipalities_freq.csv",
    schema=schema,
    header=True,
)

# Print up to 1000 rows, which is enough to show every municipality.
municipality_freq.show(1000)
+--------------------+--------------------+
| municipality| frequency|
+--------------------+--------------------+
|Kaišiadorių rajon...|0.010304096506659964|
|Kelmės rajono sav...|0.010304096506659964|
|Pakruojo rajono s...|0.003853564547206...|
|Skuodo rajono sav...|0.003853564547206...|
|Elektrėnų savival...|0.004356203401189578|
|Kazlų Rūdos saviv...|0.004356203401189578|
|Neringos savivaldybė|0.001507916561950...|
|Birštono savivaldybė|0.001507916561950...|
|Šalčininkų rajono...|0.009047499371701432|
|Švenčionių rajono...|0.005277707966825836|
|Radviliškio rajon...|0.011979559353271342|
|Vilkaviškio rajon...|0.008628633660048589|
|Širvintų rajono s...|0.003518471977883...|
|Klaipėdos miesto ...|0.061405713328306945|
|Panevėžio miesto ...| 0.04959370025969674|
|Panevėžio rajono ...| 0.02345647985255927|
|Kėdainių rajono s...|0.013990114769204993|
|Mažeikių rajono s...|0.013236156488229874|
|Anykščių rajono s...| 0.00636675881712323|
|Šiaulių miesto sa...|0.039205830610706205|
|Klaipėdos rajono ...| 0.02421043813353439|
|Šilutės rajono sa...|0.016419535896791487|
|Raseinių rajono s...|0.014660299907849544|
|Tauragės rajono s...|0.013822568484543855|
|Rokiškio rajono s...|0.008293541090726313|
|Ukmergės rajono s...|0.008042221663734606|
|Šilalės rajono sa...|0.006199212532462093|
|Molėtų rajono sav...|0.005612800536148...|
|Joniškio rajono s...|0.005445254251486974|
|Kupiškio rajono s...|0.005193934824495267|
|Akmenės rajono sa...|0.004272430258859...|
|Ignalinos rajono ...|0.003602245120214459|
|Šiaulių rajono sa...| 0.01792745245874173|
|Plungės rajono sa...|0.015498031331155232|
|Telšių rajono sav...|0.014241434196196699|
|Kretingos rajono ...| 0.01130937421462679|
|Pasvalio rajono s...|0.010806735360643378|
|Palangos miesto s...|0.010387869648990534|
|Varėnos rajono sa...|0.007874675379073468|
|Lazdijų rajono sa...|0.004775069112842423|
|Jurbarko rajono s...|0.003937337689536734|
|Zarasų rajono sav...|0.002680740554578...|
|Vilniaus miesto s...| 0.19736952333082014|
|Vilniaus rajono s...| 0.03602245120214459|
|Jonavos rajono sa...|0.014073887911535563|
|Prienų rajono sav...|0.009382591941023708|
|Šakių rajono savi...|0.008963726229370864|
|Biržų rajono savi...|0.006701851386445506|
|Alytaus miesto sa...| 0.01348747591522158|
|Trakų rajono savi...|0.013403702772891012|
|Alytaus rajono sa...| 0.01206333249560191|
|Utenos rajono sav...|0.010555415933651672|
|Druskininkų saviv...|0.002596967412247...|
|Marijampolės savi...| 0.01357124905755215|
|Kauno miesto savi...| 0.12239256094496105|
|Kauno rajono savi...| 0.02906928038870738|
|Kalvarijos saviva...|0.002261874842925358|
| Pagėgių savivaldybė|0.001424143419619...|
|Visagino savivaldybė|0.002513194269917...|
| Rietavo savivaldybė|0.003183379408561615|
+--------------------+--------------------+
# Calculating colors
# https://matplotlib.org/stable/tutorials/colors/colormaps.html
from matplotlib.cm import viridis
from matplotlib.colors import to_hex
# Compute the smallest and largest frequency in a single Spark job instead of
# launching one job per aggregate. A global aggregate always yields exactly
# one row, which unpacks positionally into (min, max).
min_freq, max_freq = municipality_freq.selectExpr(
    "min(frequency) AS min_freq",
    "max(frequency) AS max_freq",
).first()

# Width of the observed frequency interval, used to normalise values to [0, 1].
freq_range = max_freq - min_freq
def calculate_color(row):
    """
    Convert a municipality's frequency to a CSS hex color.

    The frequency is rescaled to [0, 1] using the module-level ``min_freq``
    and ``freq_range``, inverted, and looked up in the viridis colormap.

    Parameters
    ----------
    row : mapping-like record
        Must provide ``row["municipality"]`` and ``row["frequency"]``.

    Returns
    -------
    tuple
        ``(municipality, hex_color)`` where ``hex_color`` is a ``#rrggbb``
        string (alpha channel dropped).
    """
    freq = row["frequency"]
    # make freq a number between 0 and 1; when every municipality has the same
    # frequency the range is zero, so fall back to 0.0 instead of dividing by zero
    if freq_range:
        normalized_freq = (freq - min_freq) / freq_range
    else:
        normalized_freq = 0.0
    # This is because in viridis colormap, darker is lower values and we want the opposite
    inverse_freq = 1 - normalized_freq
    # transform the freq coefficient to a matplotlib color
    mpl_color = viridis(inverse_freq)
    # transform from a matplotlib color to a valid CSS color
    gmaps_color = to_hex(mpl_color, keep_alpha=False)
    return (row["municipality"], gmaps_color)
# Build a {municipality name: hex color} dictionary on the driver.
# calculate_color already returns (name, color) pairs, so it can be mapped directly.
municipality_color_pairs = municipality_freq.rdd.map(calculate_color)
colors = municipality_color_pairs.collectAsMap()
// Temporary copy of the GeoJSON from DBFS into the driver's local directory
// (file:/databricks/driver/) so that Python can read it with a plain open().
// NOTE(review): the `//` comment syntax suggests this is a Scala notebook cell - confirm.
dbutils.fs.cp("dbfs:/datasets/magellan/municipalities.geojson", "file:/databricks/driver/")
res0: Boolean = true
# Reading and processing geojson (map and borders)
import json
import gmaps
import gmaps.datasets
import gmaps.geojson_geometries
from ipywidgets.embed import embed_minimal_html
import os

# Never commit the Google API key to source: read it from the environment
# instead (e.g. set GOOGLE_API_KEY in the cluster's environment variables).
_google_api_key = os.environ.get("GOOGLE_API_KEY")
if not _google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export your Google Maps API key in the "
        "environment before running this cell."
    )
gmaps.configure(api_key=_google_api_key)  # Your Google API key
# municipalities (Lithuanian: Savivaldybės)
# Read the GeoJSON from the working directory. The file is opened in a
# context manager so the handle is closed deterministically, and decoded
# explicitly as UTF-8 (the encoding GeoJSON requires) so the Lithuanian
# names do not depend on the driver's locale.
with open('municipalities.geojson', 'r', encoding='utf-8') as geojson_file:
    municipalities = json.load(geojson_file)
# Removing municipality capitals: keep only the features whose geometry is a
# Polygon. The slice assignment filters the existing list in place, so any
# other reference to municipalities['features'] sees the same result.
# NOTE(review): every non-"Polygon" geometry type is dropped, which would
# include "MultiPolygon" features if the file contains any - confirm.
municipalities['features'][:] = [
    feature
    for feature in municipalities['features']
    if feature["geometry"]["type"] == "Polygon"
]
# One color per remaining feature, in the same order as the GeoJSON features,
# looked up by the feature's 'name' property. A name missing from `colors`
# raises KeyError.
ordered_colors = [
    colors[geo_feature['properties']['name']]
    for geo_feature in municipalities['features']
]
from pylab import *
# Generating map
fig = gmaps.figure()

# Per-feature fill colors plus a thin, fully opaque black outline.
layer_style = {
    "fill_color": ordered_colors,
    "fill_opacity": 0.8,
    "stroke_color": 'black',
    "stroke_opacity": 1.0,
    "stroke_weight": 0.2,
}
freq_layer = gmaps.geojson_layer(municipalities, **layer_style)
fig.add_layer(freq_layer)

# Write the figure to a standalone HTML file in the working directory.
embed_minimal_html("export.html", views=[fig])
# Adding color legend to map
# Sample viridis at 20 discrete steps and list them from the last index down
# to the first, giving a comma-separated list of hex colors for a CSS
# linear-gradient. rgb2hex accepts rgb or rgba.
cmap = cm.get_cmap('viridis', 20)
gradient = ",".join(
    matplotlib.colors.rgb2hex(cmap(step)) for step in reversed(range(cmap.N))
)
# Read the exported map page; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("export.html", 'r') as exported_file:
    exported_html = exported_file.read()

# CSS for the legend, injected right before the closing </head> tag. The
# gradient bar is drawn as the background of the inner <div>.
legend_head = """<style>
.legend {
max-width: 430px;
}
.legend div{
background: linear-gradient(to right, """ + gradient + """);
border-radius: 4px;
padding: 10px;
}
.legend p {
text-align: justify;
text-justify: inter-word;
margin: 0px;
margin-block-start: 0em;
margin-block-end: 0em;
height: 1em;
}
.legend p:after {
content: "";
display: inline-block;
width: 100%;
}
</style>
</head>"""

# Legend markup, injected right before the closing </body> tag: a title, the
# minimum and maximum frequency (rounded to 2 decimals) justified to the two
# ends of the line, and the empty <div> that carries the gradient.
legend_body = """
<h2>Relative frequency of accidents</h2>
<div class="legend">
<p>""" + str(round(min_freq, 2)) + " " + str(round(max_freq, 2)) + """</p>
<div></div>
</div>
</body>"""

html_file_content = exported_html.replace("</head>", legend_head).replace("</body>", legend_body)
# !!!!!!!!!!!!!!!!!!!!!
# WARNING (original author's note): this can only be run once per cluster restart.
# Pass the exported map page, with the legend CSS and markup injected above,
# to displayHTML.
displayHTML(html_file_content)